{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# preprocessing imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from time import time\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.preprocessing.text import text_to_word_sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# functions we implemented\n",
    "from custom_functions import init_embeddings_map, get_embed_and_pad_func"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emb_size = 50\n",
    "embedding_map = init_embeddings_map(\"glove.6B.\" + str(emb_size) + \"d.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data = pd.read_csv(\"data/unembedded_grouped_cleaned_data.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train/test split for our model is unique, we need to hold out a\n",
    "# set of users and movies so that our network never learns those \n",
    "test_size = 0.005\n",
    "\n",
    "# get test_size percentage of users\n",
    "unique_users = raw_data.loc[:, \"reviewerID\"].unique()\n",
    "users_size = len(unique_users)\n",
    "test_idx = np.random.choice(users_size,\n",
    "                              size=int(users_size * test_size),\n",
    "                              replace=False)\n",
    "\n",
    "# get test users\n",
    "test_users = unique_users[test_idx]\n",
    "\n",
    "# everyone else is a training user\n",
    "train_users = np.delete(unique_users, test_idx)\n",
    "\n",
    "test = raw_data[raw_data[\"reviewerID\"].isin(test_users)]\n",
    "train = raw_data[raw_data[\"reviewerID\"].isin(train_users)]\n",
    "\n",
    "unique_test_movies = test[\"asin\"].unique()\n",
    "\n",
    "# drop the movies that also appear in our test set. In order to be\n",
    "# a true train/test split, we are forced to discard some data entirely\n",
    "train = train.where(np.logical_not(train[\"asin\"].isin(unique_test_movies))).dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_seq_sizes = raw_data.loc[:, \"userReviews\"].apply(lambda x: x.split()).apply(len)\n",
    "item_seq_sizes = raw_data.loc[:, \"movieReviews\"].apply(lambda x: x.split()).apply(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "u_ptile = 40\n",
    "i_ptile = 15\n",
    "u_seq_len = int(np.percentile(user_seq_sizes, u_ptile))\n",
    "i_seq_len = int(np.percentile(item_seq_sizes, i_ptile))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_fn = get_embed_and_pad_func(i_seq_len, u_seq_len, np.array([0.0] * emb_size), embedding_map)\n",
    "    \n",
    "train_embedded = train.apply(embedding_fn, axis=1)\n",
    "test_embedded = test.apply(embedding_fn, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DeepCoNN Recommendation Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# modeling imports\n",
    "import tensorflow as tf\n",
    "from keras.models import Model\n",
    "from keras.callbacks import EarlyStopping, TensorBoard\n",
    "from keras.layers import Conv1D, MaxPooling1D, Flatten\n",
    "from keras.layers import Input, Dense\n",
    "from keras.layers.merge import Add, Dot, Concatenate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DeepCoNN():\n",
    "    def __init__(self,\n",
    "                 embedding_size,\n",
    "                 hidden_size,\n",
    "                 u_seq_len,\n",
    "                 m_seq_len,\n",
    "                 filters=2,\n",
    "                 kernel_size=10,\n",
    "                 strides=6):\n",
    "        self.embedding_size = embedding_size\n",
    "        self.hidden_size = hidden_size\n",
    "        self.filters = filters\n",
    "        self.kernel_size = kernel_size\n",
    "        self.inputU, self.towerU = self.create_deepconn_tower(u_seq_len)\n",
    "        self.inputM, self.towerM = self.create_deepconn_tower(m_seq_len)\n",
    "        self.joined = Concatenate()([self.towerU, self.towerM])\n",
    "        self.outNeuron = Dense(1)(self.joined)\n",
    "\n",
    "    def create_deepconn_tower(self, max_seq_len):\n",
    "        input_layer = Input(shape=(max_seq_len, self.embedding_size))\n",
    "        tower = Conv1D(filters=self.filters,\n",
    "                       kernel_size=self.kernel_size,\n",
    "                       activation=\"tanh\")(input_layer)\n",
    "        tower = MaxPooling1D()(tower)\n",
    "        tower = Flatten()(tower)\n",
    "        tower = Dense(self.hidden_size, activation=\"relu\")(tower)\n",
    "        return input_layer, tower\n",
    "\n",
    "    def create_deepconn_dp(self):\n",
    "        dotproduct = Dot(axes=1)([self.towerU, self.towerM])\n",
    "        output = Add()([self.outNeuron, dotproduct])\n",
    "        self.model = Model(inputs=[self.inputU, self.inputM], outputs=[output])\n",
    "        self.model.compile(optimizer='Adam', loss='mse')\n",
    "        \n",
    "    def train(self, train_data, batch_size, epochs=3500):\n",
    "        tensorboard = TensorBoard(log_dir=\"tf_logs/{}\".format(pd.Timestamp(int(time()), unit=\"s\")))\n",
    "        self.create_deepconn_dp()\n",
    "        print(self.model.summary())\n",
    "        \n",
    "        user_reviews = np.array(list(train_data.loc[:, \"userReviews\"]))\n",
    "        movie_reviews = np.array(list(train_data.loc[:, \"movieReviews\"]))\n",
    "\n",
    "        self.train_inputs = [user_reviews, movie_reviews]\n",
    "        self.train_outputs = train_data.loc[:, \"overall\"]\n",
    "        \n",
    "        self.history = self.model.fit(self.train_inputs,\n",
    "                                      self.train_outputs,\n",
    "                                      callbacks=[tensorboard],\n",
    "                                      validation_split=0.05,\n",
    "                                      batch_size=batch_size,\n",
    "                                      epochs=epochs)\n",
    "        \n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "hidden_size = 64\n",
    "deepconn = DeepCoNN(emb_size, hidden_size, u_seq_len, i_seq_len)\n",
    "\n",
    "batch_size = 32\n",
    "deepconn.train(train_embedded, batch_size, epochs=1)\n",
    "\n",
    "deepconn.model.save(\"cnn.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user_reviews = np.array(list(test_embedded.loc[:, \"userReviews\"]))\n",
    "movie_reviews = np.array(list(test_embedded.loc[:, \"movieReviews\"]))\n",
    "\n",
    "test_inputs = [user_reviews, movie_reviews]\n",
    "\n",
    "dat = pd.DataFrame(test_inputs)\n",
    "dat.to_csv(\"data/test_data.csv\")\n",
    "\n",
    "true_rating = np.array(list(test_embedded.loc[:, \"overall\"])).reshape((-1, 1))\n",
    "\n",
    "predictions = deepconn.model.predict(test_inputs)\n",
    "\n",
    "error = np.square(predictions - true_rating)\n",
    "\n",
    "print(\"MSE:\", np.average(error))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
